2024 Dev/GlowwormScale.R

#' @title GlowwormScale
#' @description Extract data from Seurat file or Counts Matrix, run normalization and background correction. Perform iterative comparison of target gene list compared to randomly selected backround gene list.
#' @param Input Seurat file or counts matrix (as dataframe, matrix or sparse matrix) with genes as row names and cell barcodes as column names
#' @param MetaData A dataframe of metadata with barcodes as row names. Default pulls from the Seurat metadata slot
#' @param ClassName A string containing the column name from the metadata corresponding to the cell class annotation (i.e. Blood)
#' @param SubclassName A string containing the column name from the metadata corresponding to the cell subclass annotation (i.e. CD4+ T Cell). Default is NA.
#' @param Genes A vector of genes you wish to analyze
#' @param GlowwormGenesOutput Results from running GlowwormGenesOutput, including the slot of either 'UniqueGenesinWindow' or 'NearestNeighborGene' 
#' @param Verbose Boolean to print progress. Default is TRUE.
#' @param SecondaryGenes The number of secondary gebes to be used in each background list. Default is equal to the length of the input gene list. [[A previously run GlowwormScale object.]]
#' @param BackgroundCorrect Boolean to use background correction. Default is TRUE.
#' @param PercentCutoff To remove background signal, clusters which express a candidate gene in less than a
#' percentage of cells will be reduced to zero. Default is 0.1 (10%)
#' @param seed Seed for selecting the background gene list. Default is 10.
#' @param Iterations Number of background iterations. Default is 5.
#' @param ScaleType Use either 'MinMax' or 'ZScore' scaling. Default is MinMax.
#' @param ConservedGenes If using multiple datasets, input a list of shared protein coding genes across all datasets.
#' @return This function takes a Seurat or Counts matrix, extracts data and merges with metadata.
#' This data is scaled and has background correction \cr \cr Exports a list containing:
#' \itemize{
#'     \item MetaData - A copy of the metadata
#'     \item GlowwormScaleOutput - A scaled data frame of n genes by n populations in metadata
#'     \item RankScores - A data frame indicating the significantly enriched cell populations in the target gene list compared to the background lust
#'     \item SumStats - A data frame showing expression level and specificity for each gene.
#'     \item Settings - A list of settings
#'         }
#' @importFrom matrixStats rowSds
#'
#' @export


GlowwormScale = function(Input, MetaData = Input@meta.data, ClassName, SubclassName = NA, Genes, GlowwormGenesOutput = NA, PercentCutoff = 0.10, Verbose = T, SecondaryGenes = length(Genes), BackgroundCorrect =T, seed = 10, ConservedGenes, ScaleType = "MinMax", Iterations = 5){
  
  #Select background genes
  if(any(Genes == names(GlowwormGenesOutput))){GeneList = GlowwormGenesOutput[[Genes]]}else if(is.na(GlowwormGenesOutput)){GeneList = Genes}else{stop("If using GlowwormGenes to determine gene inputs the GlowwormGenesOutput parameter should be set to the output of GlowwormGenes, and the Genes parameter should indicate 'UniqueGenesinWindow' or 'NearestNeighborGene'")
  }
  
  #Check candidate genes are non zero - need to be fixed
  if(length(unique(GeneList))== 0){stop("No genes in input - check results of GlowwormGenesOutput")}  
  
  if(Verbose == T){cat("| 0%     20%     40%    60%    80%    100% |\n| ##")}
  #Reorganize Metdata
  MetaData <- dplyr::rename(MetaData, Class = all_of(ClassName))
  if(is.na(SubclassName)){
    MetaDataComb = MetaData %>% dplyr::select(Class)
  }else{
    MetaData <- dplyr::rename(MetaData, Subclass = all_of(SubclassName))
    MetaDataComb = MetaData %>% dplyr::select(Class, Subclass)
  }
  OutputList = list()
  OutputList[["MetaData"]] = MetaDataComb
  if(is.na(SubclassName ) == F){
    MetaDataComb$Combined = paste(MetaDataComb[["Subclass"]], MetaDataComb[["Class"]], sep="|")
    MetaDataComb[["Subclass"]] = NULL
    MetaDataComb[["Class"]] = NULL
  }else{
    MetaDataComb$Combined = paste(MetaDataComb[["Class"]], MetaDataComb[["Class"]], sep="|")
    MetaDataComb[["Class"]] = NULL
  }
  if(Verbose == T){cat("##")}
  
  
  #Organize Genes
  if(class(Input) =="Seurat"){
    GetGenes = subset(GeneList, GeneList %in% row.names(Input@assays$RNA@counts))
    GenesNotIn = subset(GeneList, ! GeneList %in% row.names(Input@assays$RNA@counts))
    RemainingGenes = subset(row.names(Input@assays$RNA@counts), ! row.names(Input@assays$RNA@counts) %in% GeneList)
  }else if(class(Input) %in% c("dgCMatrix", "matrix", "data.frame")){
    GetGenes = subset(GeneList, GeneList %in% row.names(Input))
    GenesNotIn = subset(GeneList, ! GeneList %in% row.names(Input))  
    RemainingGenes = subset(row.names(Input), ! row.names(Input) %in% GeneList)
  }else{stop("Input file needs to be a Seurat File, or a matrix, sparse matrix or data frame with genes as row names and barcodes as column names")}
  if(Verbose == T){cat("##")}
  
  Settings = list("InputFile" = as.character(substitute(Input)),
                  "TargetGenes" = GeneList,
                  "MissingTargetGenes" = GenesNotIn,
                  #"SecondaryGenes" = list(),
                  "N_Classes" = length(unique(OutputList[["MetaData"]]$Class)),
                  "N_Subclasses" = length(unique(OutputList[["MetaData"]]$Subclass)),
                  "N_TotalClusters" = length(unique(MetaDataComb$Combined)),
                  "PercentCutoff" = PercentCutoff)  
  
  SecGenesList = list()
  CompileSecondary = as.character()
  #Select background genes
  if(class(SecondaryGenes) == "list"){
    SecGenesList = SecondaryGenes
    SecGenesNotIn =character()
    for(iter in names(SecGenesList)){
      MissingSec = subset(SecGenesList[[iter]], ! SecGenesList[[iter]] %in% RemainingGenes)
      SecGenesNotIn = unique(c(SecGenesNotIn, MissingSec))
      SecGenesList[[iter]] = subset(SecGenesList[[iter]], SecGenesList[[iter]] %in% RemainingGenes)
      CompileSecondary = unique(c(CompileSecondary, SecGenesList[[iter]]))
      BackgroundType = "selected"
    }
  }else if(class(SecondaryGenes) %in% c("numeric", "integer") & SecondaryGenes > 0){
    BackgroundType = "randomly generated"
    for(iter in c(1:Iterations)){
      set.seed(seed+iter)
      SecGenesList[[paste0("Iteration", iter)]] = sort(sample(RemainingGenes, SecondaryGenes, replace=F)) 
      CompileSecondary = unique(c(CompileSecondary, SecGenesList[[paste0("Iteration", iter)]]))
      SecGenesNotIn = NULL    
    }
    Settings[["SecondaryGenes"]] = SecGenesList
    Settings[["UniqueSecondaryGenes"]] = CompileSecondary
  }else if(length(SecondaryGenes) == 0){stop("SecondaryGenes should contain either the number of randomly selected background genes to use genes, or a string of genes to use as secondary genes. At least an equal number is recommended, but default is set to 100")}
  if(Verbose == T){cat("##")}
  
  
  #Pull data and scale and perform background correction
  AllGenes = c(GetGenes, CompileSecondary)
  Sections = ceiling(length(AllGenes)/50)
  
  if(Sections == 1){UpdateAt = 1; Sign = "####################"
  }else if(Sections %in% c(2,3)){UpdateAt = c(1,2); Sign = "##########"
  }else if(Sections == 4){UpdateAt = seq(1,4,1); Sign = "#####"  
  }else if(Sections %in% c(5,6)){UpdateAt = seq(1,5,1); Sign = "####"  
  }else if(Sections >= 7 & Sections <= 8){UpdateAt = c(1,3,4,5,7); Sign = "####" 
  }else if(Sections == 9){UpdateAt = c(1,3,5,7,9); Sign = "####" 
  }else if(Sections >= 10 & Sections <= 11){UpdateAt = seq(1,10,1); Sign = "##" 
  }else if(Sections >= 12 & Sections <= 13){UpdateAt = c(1,2,3,5,6,7,8,10,11,12); Sign = "##" 
  }else if(Sections >= 14 & Sections <= 15){UpdateAt = c(1,2,3,5,7,8,10,11,13,14); Sign = "##" 
  }else if(Sections >= 16 & Sections <= 17){UpdateAt = c(1,2,3,5,6,8,9,10,12,13,15,16); Sign = "##" 
  }else if(Sections >= 18 & Sections <= 19){UpdateAt = c(1,2,3,4,6,7,8,10,11,13,14,15,17,18); Sign = "##" 
  }else if(Sections > 20){UpdateAt = seq(round(Sections/20), Sections, round(Sections/20)); Sign = "#"}
  
  scDataList = list()
  StartPt = 1
  for(x in 1:Sections){
    SectionGenes = na.omit(AllGenes[StartPt:(StartPt+49)])
    StartPt = StartPt+50  
    if(class(Input) =="Seurat"){
      scData =  Input@assays$RNA@counts[SectionGenes, row.names(MetaData)]
    }else if(class(Input) %in% c("dgCMatrix", "matrix", "data.frame")){
      scData =  Input[SectionGenes, row.names(MetaData)]
    }
    if(length(SectionGenes) == 1){
      if(ScaleType == "ZScore"){
        scData = (scData-mean(scData))/sd(scData) #Zscore
      }else if(ScaleType == "MinMax"){
        scData = (scData- min(scData)) /(max(scData)-min(scData)) #
      }
      scData = as.matrix(scData)
      colnames(scData) = SectionGenes
    }else{
      if(ScaleType == "ZScore"){
        scData = apply(scData, 1, function(x) (x-mean(x))/sd(x)) #Zscore
      }else if(ScaleType == "MinMax"){
        scData = apply(scData, 1, function(x) (x- min(x)) /(max(x)-min(x))) #
      }}
    scData[is.na(scData)] = 0
    scData = merge(scData, MetaDataComb, by = 0)
    row.names(scData) = scData$Row.names; scData$Row.names = NULL
    if(BackgroundCorrect == F){
      if(x != 1){ #Scale but no background correction
        scData$Combined = NULL}
      scDataList[[x]] = scData
    }else if(BackgroundCorrect == T){
      scData[scData <= 0] <- 0
      scData_Pcts = as.data.frame(scData %>% group_by(Combined) %>% summarise_all(function(y){ 
        sum(as.numeric(y) > 0)/n()}))
      for(z in SectionGenes){
        ClustToReset = subset(scData_Pcts, scData_Pcts[[z]] > 0 & scData_Pcts[[z]] < PercentCutoff)
        scData[[z]][scData$Combined %in% ClustToReset$Combined] <- 0
      }
      if(x != 1){ scData$Combined = NULL}
      scDataList[[x]] = scData
    }    
    if(Verbose == T & x %in% UpdateAt){cat(Sign)}
  } 
  
  #Combine data and format for downstream use
  scDataAll = as.data.frame(scDataList, optional=T)
  if(length(c(GenesNotIn, SecGenesNotIn))>0){
    for(x in c(GenesNotIn, SecGenesNotIn)){
      scDataAll[[x]] = 0  
    }}
  OutputList[["GlowwormScaleAllCells"]] = scDataAll
  
  if(Verbose == T){cat("##")}
  
  #Concatenate data for reduced version
  scDataReduced = scDataAll %>% group_by(Combined) %>% summarise_all(mean, na.rm = TRUE) %>% as.data.frame()
  row.names(scDataReduced) = scDataReduced$Combined
  scDataReduced$Combined = NULL
  OutputList[["GlowwormScaleOutput"]] = scDataReduced
  
  if(Verbose == T){cat("##")}
  
  
  #Generate summary statistics    
  scDataReducedTarget = scDataReduced %>% dplyr::select(GetGenes)
  SumStats = as.data.frame(colSums(scDataReducedTarget))
  colnames(SumStats) = "Expression"
  SumStats$StDev = colSds(as.matrix(scDataReducedTarget))
  
  SumStats[SumStats==Inf] <- 0
  
  SumStats$Expression = (SumStats$Expression- min(SumStats$Expression))/(max(SumStats$Expression) - min(SumStats$Expression))
  SumStats$StDev = (SumStats$StDev- min(SumStats$StDev))/(max(SumStats$StDev) - min(SumStats$StDev))
  SumStats$RankScore = SumStats$Expression + SumStats$StDev
  SumStats = SumStats[order(-SumStats$RankScore),]
  
  if(Verbose == T){cat("##")}
  
  #T-tests  
  IterTTest = list()
  for(iter in 1:Iterations){
    i = 0
    CompileTRes = as.data.frame(matrix(ncol = 5, nrow=0))
    colnames(CompileTRes) = c("Combined", "P.val", "meanDiff")
    AllPops = ceiling(length(unique(scDataAll$Combined))/40)
    Progress =seq(1,length(unique(scDataAll$Combined)), AllPops)
    
    for(Comb in unique(scDataAll$Combined)){
      GetTargets = scDataAll[which(scDataAll$Combined == Comb),]#subset(scDataAll, scDataAll$Combined == Comb)  
      GetTargets[is.na(GetTargets)] = 0
      TargetGenes = GetTargets[, GetGenes]#GetTargets %>% dplyr::select(GetGenes)
      SecGenes= GetTargets[, SecGenesList[[paste0("Iteration", iter)]]]#GetTargets %>% dplyr::select(SecGenesList[[paste0("Iteration", iter)]]) ###CompileSecondary
      AvgbyCell = rowMeans(TargetGenes)
      AvgbyCell_Secondary = rowMeans(SecGenes)
      if(sd(AvgbyCell) == sd(AvgbyCell_Secondary)){
        TresCompile = as.data.frame(t(c(Comb, mean(AvgbyCell), mean(AvgbyCell_Secondary),  1, 0)))
        colnames(TresCompile) = c("Combined", "AvgTarget", "AvgSecondary", "P.val", "meanDiff")
        CompileTRes = rbind(CompileTRes, TresCompile)   
      }else{
        Tres = t.test(AvgbyCell, AvgbyCell_Secondary, paired = TRUE, alternative = "two.sided")
        TresCompile = as.data.frame(t(c(Comb,  mean(AvgbyCell), mean(AvgbyCell_Secondary), as.numeric(Tres[[3]]), as.numeric(Tres[[5]]))))
        colnames(TresCompile) = c("Combined", "AvgTarget", "AvgSecondary", "P.val", "meanDiff")
        CompileTRes = rbind(CompileTRes, TresCompile)
      }
      i = i+1
      if(i %in% Progress){cat("#")}
    }
    
    CompileTRes$P.val = as.numeric(CompileTRes$P.val)  
    CompileTRes$P.val.adj = p.adjust(CompileTRes$P.val, "BH")  
    CompileTRes$meanDiff = as.numeric(CompileTRes$meanDiff)  
    CompileTRes$AvgTarget = as.numeric(CompileTRes$AvgTarget)  
    CompileTRes$AvgSecondary = as.numeric(CompileTRes$AvgSecondary)  
    
    #Calculate frequency of target genes for each population
    GeneData_t_Binary = as.data.frame(ifelse(scDataReduced > 0, 1, 0))
    GeneData_t_Binary[is.na(GeneData_t_Binary)] = 0
    BinaryTarget = GeneData_t_Binary[, GetGenes]
    BinarySecondary = GeneData_t_Binary[, SecGenesList[[paste0("Iteration", iter)]]]#GeneData_t_Binary %>% dplyr::select(SecGenesList[[paste0("Iteration", iter)]]) ###CompileSecondary
    SumBinary = as.data.frame(rowSums(BinaryTarget))
    colnames(SumBinary) = "FrequencyTarget"
    SumBinary$FrequencySecondary = rowSums(BinarySecondary)
    SumBinary$FrequencyDeltaSub = SumBinary$FrequencyTarget - SumBinary$FrequencySecondary
    SumBinary$FrequencyDeltaDiv = SumBinary$FrequencyTarget / SumBinary$FrequencySecondary
    SumBinary[is.na(SumBinary)] = 0
    SumBinary$FrequencyDeltaDiv = ifelse(is.infinite(SumBinary$FrequencyDeltaDiv) == T, SumBinary$FrequencyTarget, SumBinary$FrequencyDeltaDiv)
    
    #Generate a column with which genes are expressed in each cell population 
    SumBinary$TargetGenesExpressed=apply(BinaryTarget,1,function(x) paste(names(BinaryTarget)[which(x==1)], collapse="/"))
    SumBinary$SecondaryGenesExpressed=apply(BinarySecondary,1,function(x) paste(names(BinarySecondary)[which(x==1)], collapse="/"))
    
    TopDeltas = merge(CompileTRes, SumBinary,by.x = "Combined", by.y = 0)
    
    
    TopDeltas$FreqTarget_Percent = TopDeltas$FrequencyTarget/length(GetGenes)
    TopDeltas$FreqBackground_Percent = TopDeltas$FrequencySecondary/length(SecGenesList[[paste0("Iteration", iter)]]) ###CompileSecondary
    TopDeltas$Percent_DeltaSub = TopDeltas$FreqTarget_Percent-TopDeltas$FreqBackground_Percent
    TopDeltas = data.frame(TopDeltas[order(-TopDeltas$Percent_DeltaSub),]) 
    TopDeltas$Significant = ifelse(TopDeltas$P.val.adj < 0.05 & TopDeltas$FrequencyTarget > 1 & TopDeltas$Percent_DeltaSub > 0 & TopDeltas$meanDiff > 0, "Enriched", "")
    TopDeltas$Subclass = gsub("\\|.*", "", TopDeltas$Combined)
    TopDeltas$Class = gsub(".*\\|", "", TopDeltas$Combined)  
    
    nSig = subset(TopDeltas, TopDeltas$Significant == "Enriched")
    IterTTest[[iter]] = TopDeltas
    
    if(iter == 1){
      TopDeltasShort = TopDeltas %>% dplyr::select(c("Combined", "Percent_DeltaSub" , "meanDiff", "Significant", "FrequencyTarget"))
      TopDeltasShort$Iteration = "Iter1"
    }else{
      TopDeltasShortV2 = TopDeltas %>% dplyr::select(c("Combined", "Percent_DeltaSub" , "meanDiff", "Significant", "FrequencyTarget"))
      TopDeltasShortV2$Iteration = paste("Iter", iter)
      TopDeltasShort = rbind(TopDeltasShort, TopDeltasShortV2)
    }
  }
  
  
  TopDeltasShort$Binary = ifelse(TopDeltasShort$Significant == "Enriched", 1, 0)
  TopDeltasEnriched = TopDeltasShort %>% group_by(Combined) %>% dplyr::summarize("nEnriched" = sum(Binary), "avgMeanDiff" = mean(meanDiff), "avgPercent" = mean(Percent_DeltaSub), "avgFreq" = mean(FrequencyTarget))
  TopDeltasEnriched = TopDeltasEnriched[order(-TopDeltasEnriched$avgPercent),]
  TopDeltasEnriched$Significant = ifelse(TopDeltasEnriched$nEnriched == Iterations, "Enriched", "")
  
  setClass("GlowwormObj", representation(MetaData = "data.frame", GlowwormScaleOutput = "data.frame", RankScores = "data.frame",  SumStats = "data.frame", Settings = "list"))
  
  Output = new("GlowwormObj", 
               MetaData = OutputList[["MetaData"]], 
               GlowwormScaleOutput = OutputList[["GlowwormScaleOutput"]], 
               RankScores = TopDeltasEnriched, 
               SumStats = SumStats, 
               Settings = Settings)
  
  if(Verbose == T){cat(paste("###### | \nGlowwormScale complete for ", Settings$InputFile, "\nFrom ", length(GeneList), " input genes, ", length(GetGenes), " genes were processed. ", length(GenesNotIn), " genes were not in input data.\n",  BackgroundType, " background genes were used.\n", sep=""))}
  ##
  setMethod("show",signature = signature(object = "GlowwormObj"),
            function(object){
              print_message <- paste(c("\nGlowwormScale for ", Settings$InputFile, "\nFrom ", length(GeneList), " input genes, ", length(GetGenes), " genes were processed. ", length(GenesNotIn), " genes were not in input data.\n",  BackgroundType, " background genes were used.\n", dim(nSig)[1], "populations are significantly enriched.", sep=""))
              cat(print_message,sep = "")
            })
  return(Output)
}
Hannahglover/Glowworm documentation built on Jan. 16, 2024, 11:47 p.m.